library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(readr)
library(viridis)
## Loading required package: viridisLite
library(leaflet)
knitr::opts_chunk$set(
    echo = TRUE,
    warning = FALSE,
    fig.width = 8, 
  fig.height = 6,
  out.width = "90%"
)
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
theme_set(theme_minimal() + theme(legend.position = "bottom"))
data_2018 = 
  read_csv("./data/2018data.csv") %>% 
  janitor::clean_names() 
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   TIME = col_time(format = ""),
##   `ZIP CODE` = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   `NUMBER OF PERSONS INJURED` = col_double(),
##   `NUMBER OF PERSONS KILLED` = col_double(),
##   `NUMBER OF PEDESTRIANS INJURED` = col_double(),
##   `NUMBER OF PEDESTRIANS KILLED` = col_double(),
##   `NUMBER OF CYCLIST INJURED` = col_double(),
##   `NUMBER OF CYCLIST KILLED` = col_double(),
##   `NUMBER OF MOTORIST INJURED` = col_double(),
##   `NUMBER OF MOTORIST KILLED` = col_double(),
##   COLLISION_ID = col_double()
## )
## See spec(...) for full column specifications.
newnames = colnames(data_2018) %>% 
  str_replace("number_of_","") 
names(data_2018) = newnames

tidy_data = 
data_2018 %>% 
  separate(date, into = c("month", "day", "year"), sep = "/") %>% 
  separate(time, into = c("hour", "minute"), sep = ":") %>%
  select(-zip_code, -location, -on_street_name, -cross_street_name, -off_street_name,-collision_id,-year) %>% 
  rename("vehicle_type" = "vehicle_type_code_1") %>% 
  mutate( day = as.numeric(day),
          month = as.numeric(month),
          hour = as.numeric(hour),
          minute = as.numeric(minute),
          latitude = replace_na(latitude,0),
          vehicle_type = str_to_lower(vehicle_type)
  ) %>%
  filter( latitude != 0)

Vehicle type

vehicle_type_data = 
  tidy_data %>% 
  mutate(
    vehicle_type = replace(vehicle_type,str_detect(vehicle_type,"truck"),"truck"),
    vehicle_type = replace(vehicle_type,str_detect(vehicle_type,"sport utility"),"sport utility vehicle")
    ) %>% 
  filter( vehicle_type %in% c("taxi","passenger vehicle","truck","sport utility vehicle")) %>% 
  group_by(vehicle_type,hour) %>% 
  summarize(
    n = n()
  )

vehicle_type_data %>% 
  plot_ly(
    x = ~hour, y = ~n, color = ~vehicle_type, type = "scatter", mode = "line") %>% 
  layout(
    title = "Collisions of Day for Different Vehicles",
    xaxis = list(title = "Hour of Day"),
    yaxis = list(title = "Collisions")
    )

Top 8 Collision Reasons

reason_data = 
  tidy_data %>%
  group_by(contributing_factor_vehicle_1) %>%
  summarize(n = n()) %>% 
  arrange(desc(n)) %>% 
  head(10)

reason_data %>% 
  plot_ly(x = ~reorder(contributing_factor_vehicle_1,desc(n)), y = ~n, color = ~contributing_factor_vehicle_1 ,type = "bar") %>% 
  layout(
    title = "The Number of Items Ordered in Each Aisle",
    xaxis = list(title = "Different Reasons"),
    yaxis = list(title = "Count")
    )

Mapping

data_2018 = tidy_data

data_2018 = rename(data_2018, long = latitude, lat = longitude)

pal <- colorNumeric(
  palette = "viridis",
  domain = data_2018$persons_injured)

data_2018 %>% 
  filter(!(lat < "-70" | lat >= "-75")) %>% 
  filter(persons_injured > 2) %>% 
    mutate(
    label = str_c("<b>vehicle type: ", vehicle_type, "</b><br>Month: ", month , sep = "") ) %>% 
  sample_n(2000) %>% 
  leaflet() %>% 
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addLegend("bottomright", pal = pal, values = ~persons_injured,
    title = "Persons Injured",
    opacity = 1
  ) %>% 
  addCircleMarkers(
    ~lat, ~long,
    color = ~pal(persons_injured),
    radius = 0.5,
    popup = ~ label) 
data_2018 %>% 
  group_by(borough) %>% 
  summarise(n())
## # A tibble: 6 x 2
##   borough       `n()`
##   <chr>         <int>
## 1 BRONX         22121
## 2 BROOKLYN      46314
## 3 MANHATTAN     29728
## 4 QUEENS        40400
## 5 STATEN ISLAND  5988
## 6 <NA>          71583

##proportion of accident and injured people by borough, by hour

data_2018 <- 
  read_csv("./data/2018data.csv") %>% 
  janitor::clean_names() 
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   TIME = col_time(format = ""),
##   `ZIP CODE` = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   `NUMBER OF PERSONS INJURED` = col_double(),
##   `NUMBER OF PERSONS KILLED` = col_double(),
##   `NUMBER OF PEDESTRIANS INJURED` = col_double(),
##   `NUMBER OF PEDESTRIANS KILLED` = col_double(),
##   `NUMBER OF CYCLIST INJURED` = col_double(),
##   `NUMBER OF CYCLIST KILLED` = col_double(),
##   `NUMBER OF MOTORIST INJURED` = col_double(),
##   `NUMBER OF MOTORIST KILLED` = col_double(),
##   COLLISION_ID = col_double()
## )
## See spec(...) for full column specifications.
newnames = colnames(data_2018) %>% 
  str_replace("number_of_","") 
names(data_2018) = newnames

tidy_data = 
data_2018 %>% 
  separate(date, into = c("day", "month", "year"), sep="/") %>% 
  separate(time, into = c("hour", "minute"), sep=":") %>%
  select(-zip_code, -location, -on_street_name, -cross_street_name, -off_street_name,-year, -collision_id) %>% 
  mutate( day = as.numeric(day),
          month = as.numeric(month),
          hour = as.numeric(hour),
          minute = as.numeric(minute),
          latitude = replace_na(latitude,0)
  ) %>% 
  filter( latitude != 0)
boro_plot = tidy_data %>% 
  drop_na(borough, persons_injured, persons_killed) %>%
  group_by(borough) %>%
  summarize(accident = n(),
            injured = sum(persons_injured)) %>%
  mutate(prop_accident = round(accident/sum(accident),4),
         prop_injured = round(injured/sum(injured),4)) %>%
  mutate(text_label1 = str_c("accident: ", accident), 
         text_label2 = str_c("injured: ", injured)) %>% 
  plot_ly(x = ~borough, y = ~prop_accident, type = 'bar', name = 'accident', text = ~text_label1, alpha = 0.8) %>%
  add_trace(x = ~borough, y = ~prop_injured, type = 'bar', name = 'injury', text = ~text_label2, alpha = 0.8) %>%
  layout(title="proportion of accident and injured people by borough", xaxis = list(title = 'borough'), barmode = 'group', legend = list(orientation = "h", xanchor = "center", x = 0.5, y = -0.2)) 
boro_plot

Number of accident and injured people in Brooklyn is greatest comparing with other boroughs. Queen is second. Staten Island have fewest accident and injured people. Manhattan and Bronx is in the middle.

hour_table = tidy_data %>% 
  drop_na(hour, persons_injured, persons_killed) %>%
  group_by(hour) %>%
  summarize(accident = n(),
            injured = sum(persons_injured)) 
hour_plot = hour_table %>%
  mutate(prop_accident = round(accident/sum(accident),4),
         prop_injured = round(injured/sum(injured),4)) %>%
  mutate(text_label1 = str_c("accident: ", accident), 
         text_label2 = str_c("injured: ", injured)) %>% 
  plot_ly(x = ~hour, y = ~prop_accident, type = 'scatter', mode = 'lines', name = 'accident', text = ~text_label1, alpha = 0.8) %>%
  add_trace(x = ~hour, y = ~prop_injured, type = 'scatter', mode = 'lines', name = 'injury', text = ~text_label2, alpha = 0.8) %>%
  layout(title="proportion of accident and injured people by hour", xaxis = list(autotick = FALSE, ticks = "outside", tick0 = 0, dtick = 1, title = 'Hour'), yaxis = list(autotick = FALSE, ticks = "outside", tick0 = 0, dtick = 0.01, title = 'proportion'), legend = list(orientation = "h", xanchor = "center", x = 0.5, y = -0.2)) 

hour_plot %>%
  layout(shapes = list(type = "rect", fillcolor = "pink", line = list(color = "pink"), opacity = 0.3, x0 = 8, x1 = 19, xref = "x", y0 = 0, y1 = 0.08, yref = "y"))

Frequency of accident and injured people in the period of 8 am-19 pm is higher than other period of time. Started from 3 am, number of accident and injured people increase and reach a small peak at 8am. At 5pm, it reaches a big peak. then it started to decrease.